{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d124d22-de73-436b-86cd-9b162b469be8",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%pip install --upgrade pip\n",
    "\n",
    "# Uninstall conflicting packages\n",
    "%pip uninstall -y langchain-core langchain-openai langchain-experimental beautifulsoup4 langchain-community langchain chromadb beautifulsoup4\n",
    "\n",
    "# Install compatible versions of langchain-core and langchain-openai\n",
    "%pip install langchain-core==0.3.6\n",
    "%pip install langchain-openai==0.2.1\n",
    "%pip install langchain-experimental==0.3.2\n",
    "%pip install langchain-community==0.3.1\n",
    "%pip install langchain==0.3.1\n",
    "\n",
    "# Install remaining packages\n",
    "%pip install chromadb==0.5.11\n",
    "%pip install beautifulsoup4==4.12.3\n",
    "\n",
    "# Restart the kernel after installation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0cd4b7a9-f8e8-4e23-9366-bdb6da2e360c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# New OS parameter to avoid warnings.  \n",
    "# This will not have a material impact on your code, but prevents warnings from appearing related to new LangChain features.\n",
    "import os\n",
    "os.environ['USER_AGENT'] = 'RAGUserAgent'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f884314f-870c-4bfb-b6c1-a5b4801ec172",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import WebBaseLoader\n",
    "import bs4\n",
    "import openai\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
    "from langchain import hub\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.runnables import RunnablePassthrough\n",
    "import chromadb\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_experimental.text_splitter import SemanticChunker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "721241b4-32ab-476a-a5ac-9feab48459e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# OpenAI Setup\n",
    "os.environ['OPENAI_API_KEY'] = ''\n",
    "openai.api_key = os.environ['OPENAI_API_KEY']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d3ad428a-3eb6-40ec-a1a5-62565ead1e5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#### INDEXING ####"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "98ccda2c-0f4c-41c5-804d-2227cdf35aa7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Documents\n",
    "loader = WebBaseLoader(\n",
    "    web_paths=(\"https://kbourne.github.io/chapter1.html\",), \n",
    "    bs_kwargs=dict(\n",
    "        parse_only=bs4.SoupStrainer(\n",
    "            class_=(\"post-content\", \"post-title\", \"post-header\")\n",
    "        )\n",
    "    ),\n",
    ")\n",
    "docs = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "927a4c65-aa05-486c-8295-2f99673e7c20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split\n",
    "text_splitter = SemanticChunker(OpenAIEmbeddings())\n",
    "splits = text_splitter.split_documents(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6b13568c-d633-464d-8c43-0d55f34cc8c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embed\n",
    "vectorstore = Chroma.from_documents(documents=splits, \n",
    "                                    embedding=OpenAIEmbeddings())\n",
    "\n",
    "retriever = vectorstore.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6ce8df01-925b-45b5-8fb8-17b5c40c581f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#### RETRIEVAL and GENERATION ####"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "eb47c817-b5ac-4d90-84ee-4cd209e52a80",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.11/site-packages/langsmith/client.py:323: LangSmithMissingAPIKeyWarning: API key must be provided when using hosted LangSmith API\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "# Prompt - ignore LangSmith warning, you will not need langsmith for this coding exercise\n",
    "prompt = hub.pull(\"jclemens24/rag-prompt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e8975479-b3e3-481d-ad7b-08b4eb3faaef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Post-processing\n",
    "def format_docs(docs):\n",
    "    return \"\\n\\n\".join(doc.page_content for doc in docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "deb6d70c-42ef-4bda-9607-48f02c941280",
   "metadata": {},
   "outputs": [],
   "source": [
    "# LLM\n",
    "llm = ChatOpenAI(model_name=\"gpt-4o-mini\", temperature=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "fd9db713-f705-4b65-800e-2c4e3d0e4ef4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chain it all together with LangChain\n",
    "rag_chain = (\n",
    "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
    "    | prompt\n",
    "    | llm\n",
    "    | StrOutputParser()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "8b30177a-f9ab-45e4-812d-33b0f97325bd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"The advantages of using Retrieval-Augmented Generation (RAG) include:\\n\\n1. **Improved Accuracy and Relevance**: RAG enhances the accuracy and relevance of responses generated by large language models (LLMs) by incorporating specific, real-time information from databases or datasets, ensuring outputs are based on both the model's pre-existing knowledge and the most current data.\\n\\n2. **Customization and Flexibility**: RAG allows for tailored responses based on domain-specific needs by integrating a company's internal databases into the response generation process, creating personalized experiences and outputs that meet unique business requirements.\\n\\n3. **Expanding Model Knowledge Beyond Training Data**: RAG enables models to access and utilize information that was not included in their initial training sets, effectively expanding the model's knowledge base without the need for retraining, making LLMs more versatile and adaptable to new domains or rapidly evolving topics.\""
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Question - run the chain\n",
    "rag_chain.invoke(\"What are the advantages of using RAG?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7082f647-bf11-4dee-8121-ae8c8a66cb4b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
